# Count the lines of every .R and .Rmd file below a directory, skipping one subtree.
#
# Arguments:
#   $1 - directory to search
#   $2 - path to prune from the search; it is matched with find's -path, so it
#        must be spelled the way find reports it (e.g. "./packrat" when $1 is ".")
# Output:
#   One summary line with the total, written to stdout.
# Returns:
#   1 when either argument is empty, otherwise the status of the final echo.
count_lines_of_code() {
  local directory="$1"
  local exclude_directory="$2"
  local total_lines=0

  if [ -z "$directory" ]; then
    echo "Please provide a directory."
    return 1
  fi
  if [ -z "$exclude_directory" ]; then
    echo "Please provide a directory to exclude."
    return 1
  fi

  # Let find hand the file names straight to cat instead of looping over
  # $(find ...): command substitution word-splits the paths, so any file name
  # containing whitespace would be broken into pieces. Counting the newlines of
  # the concatenated stream equals the sum of the per-file counts.
  total_lines=$(find "$directory" -path "$exclude_directory" -prune -o \
    -type f \( -name "*.R" -o -name "*.Rmd" \) -exec cat {} + | wc -l)
  # Arithmetic expansion strips the padding some wc implementations add.
  total_lines=$((total_lines))

  echo "Total lines of code in .R and .Rmd files in $directory excluding $exclude_directory: $total_lines"
}
# Count the R/Rmd lines below the current directory, pruning the ./packrat subtree
count_lines_of_code "." "./packrat"
## Total lines of code in .R and .Rmd files in . excluding ./packrat: 1258
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'plotly'
##
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
##
## The following object is masked from 'package:stats':
##
## filter
##
##
## The following object is masked from 'package:graphics':
##
## layout
# Run the import script in knitr's global environment without echoing its code.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
source("data_import.R", echo = FALSE, local = knitr::knit_global())
##
## Attaching package: 'rio'
## The following object is masked from 'package:plotly':
##
## export
# Read the code chunks of the import script so knitr can reference them
knitr::read_chunk("data_import.R")
Liczba rekordów we wszystkich ramkach danych
# Add up the row counts of every data frame in the current environment:
# list the object names, fetch each object by name, keep only the data frames
# and sum their nrow() values.
total_rows <- sum(
  ls() %>%
    set_names() %>%
    map(~ get(.x)) %>%
    keep(is.data.frame) %>%
    map_int(nrow)
)
total_rows
## [1] 141812
# Check whether the combined row count exceeds 30000
total_rows > 30000
## [1] TRUE
Aby uniknąć duplikatów w kolumnach, należy usunąć kolumnę “Pos”
# The cleaning script is not executed here (the source() call stays disabled);
# only its code chunks are read so knitr can reference them.
# source("cleaning.R", echo = F, local = knitr::knit_global())
knitr::read_chunk("cleaning.R")
# Artist metadata read from the musicoset_metadata folder
OSet_artists <- import("clean_data/musicoset_metadata/artists.csv")

# Collapse the detailed genre labels into broad families. The patterns are
# tried top to bottom and the first hit wins, so e.g. a label containing both
# "rock" and "pop" becomes "rock". Labels matching nothing are kept as they are.
OSet_artists <- mutate(
  OSet_artists,
  main_genre = case_when(
    # rap, hip hop and drill all map to the same family
    grepl("rap|hip hop|drill", main_genre) ~ "rap",
    grepl("rock", main_genre) ~ "rock",
    grepl("country", main_genre) ~ "country",
    grepl("r&b", main_genre) ~ "r&b",
    grepl("edm", main_genre) ~ "edm",
    grepl("pop", main_genre) ~ "pop",
    grepl("soul", main_genre) ~ "soul",
    .default = as.character(main_genre)
  )
)

# Artist name -> unified genre lookup table
OSet_artists_genre <- select(OSet_artists, name, main_genre)
Przetwarzanie, aby uzyskać ramkę danych z najchętniej słuchanymi gatunkami i ramkę danych z najchętniej słuchanymi artystami ogólnie
# Available source tables:
#   eu_artisttotals             - only in europe
#   eua_artisttotals            - based on albums - only in europe
#   ww_artisttotals
#   spotify_listeners
#   youtube_archive
#   apple_songs_artisttotals
#   apple_songs_eu_artisttotals - only in europe

# Rename the lookup columns so the genre table shares the "Artist" join key
names(OSet_artists_genre) <- c("Artist", "Genre")

# Keep only artists present in every source. A suffix is applied only when a
# column name collides at that particular join, so the join order decides the
# final names: as the printed columns show, the YouTube join yields Total.ww /
# Total.youtube and the Apple "Total" therefore arrives without a suffix.
artists_genre_comparison <- ww_artisttotals %>%
  inner_join(spotify_listeners, by = "Artist", suffix = c(".ww", ".spotify")) %>%
  inner_join(youtube_archive, by = "Artist", suffix = c(".ww", ".youtube")) %>%
  inner_join(apple_songs_artisttotals, by = "Artist", suffix = c(".ww", ".apple")) %>%
  inner_join(OSet_artists_genre, by = "Artist", suffix = c(".ww", ".genre"))

# Spot check on one artist that the joins did not produce duplicate rows
artists_genre_comparison %>%
  filter(Artist == "Adele")
## Artist Total.ww Today.ww 1M.ww 2M.ww 3M.ww 4M.ww 5M.ww Listeners
## 1 Adele 82,855,252 8,797 10 8 6 5 5 50,515,140
## Daily Trend Peak PkListeners Total.youtube 100M Total Today.apple
## 1 104,639 31 58,244,524 13,707.3 18 11,442,493 1,377
## 1M.apple 2M.apple 3M.apple 4M.apple 5M.apple Genre
## 1 3 1 1 1 0 soul
# Column cleaning: strip the thousands separators and parse the figures as
# numbers, give the Apple and Spotify figures service-specific names, then add
# the cross-service total and sort by it (largest first).
# NOTE(review): transform() rebuilds the frame through data.frame(), which may
# rewrite non-syntactic column names such as `1M.ww` or `Daily Trend` - confirm
# before relying on those columns downstream.
artists_genre_comparison <- artists_genre_comparison %>%
  transform(
    # Total.youtube column is equal to total views in millions
    Total.youtube = as.numeric(str_replace_all(Total.youtube, ",", "")) * 1000000,
    Total.ww = as.numeric(str_replace_all(Total.ww, ",", "")),
    PkListeners = as.numeric(str_replace_all(PkListeners, ",", "")),
    Total = as.numeric(str_replace_all(Total, ",", "")),
    Listeners = as.numeric(str_replace_all(Listeners, ",", ""))
  ) %>%
  rename(Total.apple = Total, Total.spotify = PkListeners) %>%
  mutate(
    # Sum of all the listeners and views across all of the dsp columns
    Sum_listeners_views_across = Total.youtube + Total.ww + Total.apple + Total.spotify
  ) %>%
  arrange(desc(Sum_listeners_views_across))
# Number of artists per genre, most common genre first.
# count() on a grouped frame keeps the Genre grouping on the result.
preferable_genres <- artists_genre_comparison %>%
  group_by(Genre) %>%
  count() %>%
  arrange(desc(n))

head(preferable_genres)
## # A tibble: 6 × 2
## # Groups: Genre [6]
## Genre n
## <chr> <int>
## 1 pop 88
## 2 rap 26
## 3 rock 11
## 4 latin 6
## 5 country 5
## 6 r&b 4
# Bar chart of the six most common genres, tallest bar first.
# The title previously read "Najczęciej" (missing "ś").
ggplot(
  head(preferable_genres),
  aes(x = reorder(Genre, -n), y = n)
) +
  # geom_col() draws bars whose height is the value itself
  geom_col(width = 0.5, fill = "#581845") +
  labs(title = "Najczęściej słuchane gatunki", x = "", y = "") +
  theme_fivethirtyeight() +
  scale_fill_fivethirtyeight()
# Top 100 artists by the cross-service total
top_artists_total <- artists_genre_comparison %>%
  select(Artist, Sum_listeners_views_across) %>%
  top_n(100, wt = Sum_listeners_views_across)

# Split the artists into three label sets (drawn with different text sizes)
top_labels <- top_artists_total %>%
  top_n(20, wt = Sum_listeners_views_across)
bot_labels <- top_artists_total %>%
  top_n(-18, wt = Sum_listeners_views_across)
# Everything that is neither in the top nor in the bottom set. Selecting by
# membership instead of by row position keeps the three sets disjoint and
# complete regardless of row order, ties, or a table shorter than 39 rows
# (where 21:(n - 18) would count downwards and pick the wrong rows).
mid_labels <- top_artists_total %>%
  filter(!Artist %in% c(top_labels$Artist, bot_labels$Artist))
# Static bubble chart of the top artists by cross-service total (log y axis).
# Bubble size follows the log of the total, opacity the total itself; only the
# artists in top_labels get a text label.
ggplot(
  top_artists_total,
  aes(
    x = Artist,
    y = Sum_listeners_views_across,
    size = log(Sum_listeners_views_across)
  )
) +
  geom_point(
    aes(alpha = Sum_listeners_views_across),
    color = "#7e7b77",
    fill = "#edc491",
    shape = 21
  ) +
  geom_text(
    data = top_labels,
    aes(label = Artist),
    vjust = 0.5,
    hjust = 0.5,
    size = 2.5,
    color = "#1b2322"
  ) +
  scale_size(range = c(1, 20)) +
  scale_alpha_continuous(range = c(0.3, 1)) +
  scale_y_log10() +
  labs(
    title = "Najczęściej streamowani artyści",
    subtitle = "Globalnie - sumaryzacja",
    x = "Artysta",
    y = "Ilość streamów"
  ) +
  theme_fivethirtyeight() +
  scale_fill_fivethirtyeight() +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    legend.position = "none",
    panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
    panel.grid.minor = element_line(color = "grey95", linewidth = 0.25),
    plot.title = element_text(size = 13),
    plot.subtitle = element_text(size = 10)
  )
# TODO: repel only the labels that need it
# TODO: build the same plot for Poland
# Interactive version of the cross-service bubble chart. Every artist is
# labelled; the text size shrinks from the top set to the middle and bottom sets.
ggplotly(
  ggplot(
    top_artists_total,
    aes(
      x = Artist,
      y = Sum_listeners_views_across,
      size = log(Sum_listeners_views_across)
    )
  ) +
    geom_point(
      aes(alpha = Sum_listeners_views_across),
      color = "#7e7b77",
      fill = "#edc491",
      shape = 21
    ) +
    geom_text(
      data = top_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 2.5,
      color = "#1b2322"
    ) +
    geom_text(
      data = mid_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 2,
      color = "#1b2322"
    ) +
    geom_text(
      data = bot_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 1,
      color = "#1b2322"
    ) +
    scale_size(range = c(3, 15), name = "Sum of Listeners/Views") +
    scale_alpha_continuous(range = c(0.3, 1)) +
    scale_y_log10() +
    labs(
      title = "Najczęściej streamowani artyści",
      subtitle = "Globalnie - wersja interaktywna",
      x = "Artysta",
      y = "Ilość streamów"
    ) +
    theme_fivethirtyeight() +
    scale_fill_fivethirtyeight() +
    theme(
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      legend.position = "none",
      panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
      panel.grid.minor = element_line(color = "grey95", linewidth = 0.25),
      plot.title = element_text(size = 13),
      plot.subtitle = element_text(size = 10)
    )
)
# Number of Spotify artists per genre, most common genre first.
#
# Fixes over the previous version:
# * str_replace() removed only the first comma, so a value with two separators
#   (e.g. "1,234,567") became NA and the row was then silently dropped;
#   str_replace_all() strips every separator.
# * drop_na() without arguments dropped a row for an NA in any column; only
#   artists without a genre match are meant to go, so it is limited to Genre.
# * warnings are suppressed only around the numeric parse instead of around the
#   whole pipeline, so unrelated problems stay visible.
preferable_genres_spotify <- spotify_artists %>%
  mutate(across(
    c("Streams", "Daily", "As lead", "Solo", "As feature"),
    # the scaling by one million is carried over unchanged from the original
    ~ suppressWarnings(as.numeric(str_replace_all(.x, ",", ""))) * 1000000
  )) %>%
  left_join(
    OSet_artists_genre,
    by = "Artist",
    suffix = c(".spotify", ".oset")
  ) %>%
  drop_na(Genre) %>% # dropping rows that were not matched in left_join
  select(Artist, Genre, Streams) %>%
  group_by(Genre) %>%
  count() %>%
  arrange(desc(n))

head(preferable_genres_spotify)
## # A tibble: 6 × 2
## # Groups: Genre [6]
## Genre n
## <chr> <int>
## 1 pop 366
## 2 rap 205
## 3 rock 101
## 4 country 68
## 5 latin 39
## 6 r&b 36
# Bar chart of the six most common genres on Spotify, tallest bar first
ggplot(
  head(preferable_genres_spotify),
  aes(x = reorder(Genre, -n), y = n)
) +
  geom_col(width = 0.5, fill = "#6cd980") +
  labs(
    title = "Najczęściej słuchane gatunki na Spotify",
    subtitle = "Globalnie",
    x = "",
    y = ""
  ) +
  theme_fivethirtyeight() +
  scale_fill_fivethirtyeight() +
  theme(
    plot.title = element_text(size = 13),
    plot.subtitle = element_text(size = 10)
  )
# TODO: group the data frames by each service's own figures
# Top 100 artists by the Spotify figure
top_artists_spotify <- artists_genre_comparison %>%
  select(Artist, Total.spotify) %>%
  top_n(100, wt = Total.spotify)

# Split the artists into three label sets (drawn with different text sizes)
top_labels <- top_artists_spotify %>%
  top_n(20, wt = Total.spotify)
bot_labels <- top_artists_spotify %>%
  top_n(-18, wt = Total.spotify)
# Everything that is neither in the top nor in the bottom set. top_n() keeps
# the incoming row order, which is not the Total.spotify order, so the previous
# positional slice(21:(n - 18)) overlapped the other two sets and skipped
# artists. Selecting by membership does not depend on row order.
mid_labels <- top_artists_spotify %>%
  filter(!Artist %in% c(top_labels$Artist, bot_labels$Artist))
# Static bubble chart of the top artists by the Spotify figure (log y axis).
# Bubble size follows the log of the figure, opacity the figure itself; only
# the artists in top_labels get a text label.
ggplot(
  top_artists_spotify,
  aes(x = Artist, y = Total.spotify, size = log(Total.spotify))
) +
  geom_point(
    aes(alpha = Total.spotify),
    color = "#7e7b77",
    fill = "#6cd980",
    shape = 21
  ) +
  geom_text(
    data = top_labels,
    aes(label = Artist),
    vjust = 0.5,
    hjust = 0.5,
    size = 2.5,
    color = "#1b2322"
  ) +
  scale_size(range = c(1, 20)) +
  scale_alpha_continuous(range = c(0.3, 1)) +
  scale_y_log10() +
  labs(
    title = "Najczęściej streamowani artyści na Spotify",
    subtitle = "Globalnie - sumaryzacja",
    x = "Artysta",
    y = "Ilość streamów"
  ) +
  theme_fivethirtyeight() +
  scale_fill_fivethirtyeight() +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    legend.position = "none",
    panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
    panel.grid.minor = element_line(color = "grey95", linewidth = 0.25),
    plot.title = element_text(size = 13),
    plot.subtitle = element_text(size = 10)
  )
# TODO: repel only the labels that need it
# TODO: build the same plot for Poland
# Interactive version of the Spotify bubble chart. Every artist is labelled;
# the text size shrinks from the top set to the middle and bottom sets.
ggplotly(
  ggplot(
    top_artists_spotify,
    aes(x = Artist, y = Total.spotify, size = log(Total.spotify))
  ) +
    geom_point(
      aes(alpha = Total.spotify),
      color = "#7e7b77",
      fill = "#6cd980",
      shape = 21
    ) +
    geom_text(
      data = top_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 2.5,
      color = "#1b2322"
    ) +
    geom_text(
      data = mid_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 2,
      color = "#1b2322"
    ) +
    geom_text(
      data = bot_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 1,
      color = "#1b2322"
    ) +
    scale_size(range = c(3, 15), name = "Sum of Listeners/Views") +
    scale_alpha_continuous(range = c(0.3, 1)) +
    scale_y_log10() +
    labs(
      title = "Najczęściej streamowani artyści na Spotify",
      subtitle = "Globalnie - wersja interaktywna",
      x = "Artysta",
      y = "Ilość streamów"
    ) +
    theme_fivethirtyeight() +
    scale_fill_fivethirtyeight() +
    theme(
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      legend.position = "none",
      panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
      panel.grid.minor = element_line(color = "grey95", linewidth = 0.25),
      plot.title = element_text(size = 13),
      plot.subtitle = element_text(size = 10)
    )
)
# Number of Apple Music artists per genre, most common genre first.
#
# Fixes over the previous version:
# * str_replace() removed only the first comma, so totals with two separators
#   (e.g. "11,442,493") became NA and those artists were then silently dropped;
#   str_replace_all() strips every separator.
# * drop_na() without arguments dropped a row for an NA in any column; only
#   artists without a genre match are meant to go, so it is limited to Genre.
# * the "empty row" is now removed by value instead of by position ([-6, ]),
#   because its position changes whenever the counts change.
# * warnings are suppressed only around the numeric parse instead of around the
#   whole pipeline, so unrelated problems stay visible.
preferable_genres_apple <- apple_songs_artisttotals %>%
  mutate(across(
    c("Total", "Today"),
    # the scaling by one million is carried over unchanged from the original
    ~ suppressWarnings(as.numeric(str_replace_all(.x, ",", ""))) * 1000000
  )) %>%
  left_join(
    OSet_artists_genre,
    by = "Artist",
    suffix = c(".apple", ".oset")
  ) %>%
  drop_na(Genre) %>% # dropping rows that were not matched in left_join
  # NOTE(review): presumably the row removed by position was a blank genre
  # label - confirm against the data
  filter(trimws(Genre) != "") %>%
  select(Artist, Genre, Total) %>%
  group_by(Genre) %>%
  count() %>%
  arrange(desc(n))

head(preferable_genres_apple)
## # A tibble: 6 × 2
## # Groups: Genre [6]
## Genre n
## <chr> <int>
## 1 pop 29
## 2 rap 14
## 3 rock 9
## 4 adult standards 3
## 5 latin 3
## 6 australian dance 2
# Bar chart of the six most common genres on Apple Music, tallest bar first
ggplot(
  head(preferable_genres_apple),
  aes(x = reorder(Genre, -n), y = n)
) +
  geom_col(width = 0.5, fill = "#fa2a44") +
  labs(
    title = "Najczęściej słuchane gatunki na Apple Music",
    subtitle = "Globalnie",
    x = "",
    y = ""
  ) +
  theme_fivethirtyeight() +
  scale_fill_fivethirtyeight() +
  theme(
    plot.title = element_text(size = 13),
    plot.subtitle = element_text(size = 10)
  )
# Top 100 artists by the Apple Music figure
top_artists_apple <- artists_genre_comparison %>%
  select(Artist, Total.apple) %>%
  top_n(100, wt = Total.apple)

# Split the artists into three label sets (drawn with different text sizes)
top_labels <- top_artists_apple %>%
  top_n(20, wt = Total.apple)
bot_labels <- top_artists_apple %>%
  top_n(-18, wt = Total.apple)
# Everything that is neither in the top nor in the bottom set. top_n() keeps
# the incoming row order, which is not the Total.apple order, so the previous
# positional slice(21:(n - 18)) overlapped the other two sets and skipped
# artists. Selecting by membership does not depend on row order.
mid_labels <- top_artists_apple %>%
  filter(!Artist %in% c(top_labels$Artist, bot_labels$Artist))
# Static bubble chart of the top artists by the Apple Music figure (log y axis).
# Bubble size follows the log of the figure, opacity the figure itself; only
# the artists in top_labels get a text label.
ggplot(
  top_artists_apple,
  aes(x = Artist, y = Total.apple, size = log(Total.apple))
) +
  geom_point(
    aes(alpha = Total.apple),
    color = "#7e7b77",
    fill = "#e04a5d",
    shape = 21
  ) +
  geom_text(
    data = top_labels,
    aes(label = Artist),
    vjust = 0.5,
    hjust = 0.5,
    size = 2.5,
    color = "#1b2322"
  ) +
  scale_size(range = c(1, 20)) +
  scale_alpha_continuous(range = c(0.3, 1)) +
  scale_y_log10() +
  labs(
    title = "Najczęściej streamowani artyści na Apple Music",
    subtitle = "Globalnie - sumaryzacja",
    x = "Artysta",
    y = "Ilość streamów"
  ) +
  theme_fivethirtyeight() +
  scale_fill_fivethirtyeight() +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    legend.position = "none",
    panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
    panel.grid.minor = element_line(color = "grey95", linewidth = 0.25),
    plot.title = element_text(size = 13),
    plot.subtitle = element_text(size = 10)
  )
# TODO: repel only the labels that need it
# TODO: build the same plot for Poland
# Interactive version of the Apple Music bubble chart. Every artist is
# labelled; the text size shrinks from the top set to the middle and bottom sets.
ggplotly(
  ggplot(
    top_artists_apple,
    aes(x = Artist, y = Total.apple, size = log(Total.apple))
  ) +
    geom_point(
      aes(alpha = Total.apple),
      color = "#7e7b77",
      fill = "#e04a5d",
      shape = 21
    ) +
    geom_text(
      data = top_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 2.5,
      color = "#1b2322"
    ) +
    geom_text(
      data = mid_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 2,
      color = "#1b2322"
    ) +
    geom_text(
      data = bot_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 1,
      color = "#1b2322"
    ) +
    scale_size(range = c(3, 15), name = "Sum of Listeners/Views") +
    scale_alpha_continuous(range = c(0.3, 1)) +
    scale_y_log10() +
    labs(
      title = "Najczęściej streamowani artyści na Apple Music",
      subtitle = "Globalnie - wersja interaktywna",
      x = "Artysta",
      y = "Ilość streamów"
    ) +
    theme_fivethirtyeight() +
    scale_fill_fivethirtyeight() +
    theme(
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      legend.position = "none",
      panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
      panel.grid.minor = element_line(color = "grey95", linewidth = 0.25),
      plot.title = element_text(size = 13),
      plot.subtitle = element_text(size = 10)
    )
)
# Number of YouTube artists per genre, most common genre first.
#
# Fixes over the previous version:
# * str_replace() removed only the first comma, so a value with two separators
#   (e.g. "1,234,567.8") became NA and the row was then silently dropped;
#   str_replace_all() strips every separator.
# * drop_na() without arguments dropped a row for an NA in any column; only
#   artists without a genre match are meant to go, so it is limited to Genre.
# * warnings are suppressed only around the numeric parse instead of around the
#   whole pipeline, so unrelated problems stay visible.
preferable_genres_yt <- youtube_archive %>%
  mutate(across(
    c("Total", "100M"),
    # the scaling by one million is carried over unchanged from the original
    ~ suppressWarnings(as.numeric(str_replace_all(.x, ",", ""))) * 1000000
  )) %>%
  left_join(
    OSet_artists_genre,
    by = "Artist",
    suffix = c(".youtube", ".oset")
  ) %>%
  drop_na(Genre) %>% # dropping rows that were not matched in left_join
  select(Artist, Genre, Total) %>%
  group_by(Genre) %>%
  count() %>%
  arrange(desc(n))

head(preferable_genres_yt)
## # A tibble: 6 × 2
## # Groups: Genre [6]
## Genre n
## <chr> <int>
## 1 pop 280
## 2 rap 112
## 3 rock 81
## 4 alternative metal 39
## 5 latin 32
## 6 country 28
# Bar chart of the six most common genres on YouTube, tallest bar first
ggplot(
  head(preferable_genres_yt),
  aes(x = reorder(Genre, -n), y = n)
) +
  geom_col(width = 0.5, fill = "#db0000") +
  labs(
    title = "Najczęściej słuchane gatunki na YouTube",
    subtitle = "Globalnie",
    x = "",
    y = ""
  ) +
  theme_fivethirtyeight() +
  scale_fill_fivethirtyeight() +
  theme(
    plot.title = element_text(size = 13),
    plot.subtitle = element_text(size = 10)
  )
# Top 100 artists by the YouTube figure
top_artists_yt <- artists_genre_comparison %>%
  select(Artist, Total.youtube) %>%
  top_n(100, wt = Total.youtube)

# Split the artists into three label sets (drawn with different text sizes)
top_labels <- top_artists_yt %>%
  top_n(20, wt = Total.youtube)
bot_labels <- top_artists_yt %>%
  top_n(-18, wt = Total.youtube)
# Everything that is neither in the top nor in the bottom set. top_n() keeps
# the incoming row order, which is not the Total.youtube order, so the previous
# positional slice(21:(n - 18)) overlapped the other two sets and skipped
# artists. Selecting by membership does not depend on row order.
mid_labels <- top_artists_yt %>%
  filter(!Artist %in% c(top_labels$Artist, bot_labels$Artist))
# Static bubble chart of the top artists by the YouTube figure (log y axis).
# Bubble size follows the log of the figure, opacity the figure itself; only
# the artists in top_labels get a text label.
ggplot(
  top_artists_yt,
  aes(x = Artist, y = Total.youtube, size = log(Total.youtube))
) +
  geom_point(
    aes(alpha = Total.youtube),
    color = "#7e7b77",
    fill = "#deb3b1",
    shape = 21
  ) +
  geom_text(
    data = top_labels,
    aes(label = Artist),
    vjust = 0.5,
    hjust = 0.5,
    size = 2.5,
    color = "#1b2322"
  ) +
  scale_size(range = c(1, 20)) +
  scale_alpha_continuous(range = c(0.3, 1)) +
  scale_y_log10() +
  labs(
    title = "Najczęściej streamowani artyści na YouTube",
    subtitle = "Globalnie - sumaryzacja",
    x = "Artysta",
    y = "Ilość streamów"
  ) +
  theme_fivethirtyeight() +
  scale_fill_fivethirtyeight() +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    legend.position = "none",
    panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
    panel.grid.minor = element_line(color = "grey95", linewidth = 0.25),
    plot.title = element_text(size = 13),
    plot.subtitle = element_text(size = 10)
  )
# TODO: repel only the labels that need it
# TODO: build the same plot for Poland
# Interactive version of the YouTube bubble chart. Every artist is labelled;
# the text size shrinks from the top set to the middle and bottom sets.
ggplotly(
  ggplot(
    top_artists_yt,
    aes(x = Artist, y = Total.youtube, size = log(Total.youtube))
  ) +
    geom_point(
      aes(alpha = Total.youtube),
      color = "#7e7b77",
      fill = "#deb3b1",
      shape = 21
    ) +
    geom_text(
      data = top_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 2.5,
      color = "#1b2322"
    ) +
    geom_text(
      data = mid_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 2,
      color = "#1b2322"
    ) +
    geom_text(
      data = bot_labels,
      aes(label = Artist),
      vjust = 0.5,
      hjust = 0.5,
      size = 1,
      color = "#1b2322"
    ) +
    scale_size(range = c(3, 15), name = "Sum of Listeners/Views") +
    scale_alpha_continuous(range = c(0.3, 1)) +
    scale_y_log10() +
    labs(
      title = "Najczęściej streamowani artyści na YouTube",
      subtitle = "Globalnie - wersja interaktywna",
      x = "Artysta",
      y = "Ilość streamów"
    ) +
    theme_fivethirtyeight() +
    scale_fill_fivethirtyeight() +
    theme(
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      legend.position = "none",
      panel.grid.major = element_line(color = "grey90", linewidth = 0.5),
      panel.grid.minor = element_line(color = "grey95", linewidth = 0.25),
      plot.title = element_text(size = 13),
      plot.subtitle = element_text(size = 10)
    )
)